Unlocking single cell spatial omics analyses with scdney
Yue Cao, Andy Tran, Daniel Kim, Dario Strbenac, Nicholas Robertson, Helen Fu, Jean Yang
24 July, 2024
breastCancerIMC.Rmd
## library(devtools)
## library(BiocManager)
library(SingleCellExperiment)
library(ggplot2)
library(scFeatures) ## devtools::install_github("SydneyBioX/scFeatures")
library(ClassifyR) ## BiocManager::install("ClassifyR", dependencies = TRUE)
library(lisaClust)
library(ggthemes)
library(spicyR) ## BiocManager::install("spicyR")
library(dplyr)
library(limma)
library(plotly)
library(scattermore)
library(tidyr)
library(survival)
library(survminer)
library(spatstat)
library(scater)
library(scran)
library(reshape)
theme_set(theme_classic())
# code for plotting purpose
plot_boxplot <- function( feature ){
data_plot <- t(feature)
data_plot <- melt(data_plot )
colnames(data_plot) <- c("X1", "X2", "value")
data_plot$condition <- unlist( lapply( strsplit( as.character( data_plot$X2), "_cond_"), `[`, 2))
p <- ggplot(data_plot, aes( x = X1, y = value , colour = condition)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
return(p)
}
plot_barplot <- function(data , dodge=F ){
data$patient <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 1))
data$condition <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 2))
data <- as.data.frame( melt(data, id=c("patient", "condition")) )
p <- ggplot(data , aes( x = patient , y = value , fill = variable) ) +
geom_bar(stat="identity" ) + facet_wrap(~condition, scale="free") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
return (p)
}
draw_dotplot <- function(data_sce, sample, celltype , group , group_of_interest ){
df <- data.frame(colData( data_sce))
# for each patient, calculate the proportion of each cell type in each region.
df_plot <- NULL
for ( thispatient in unique(df[[sample]])){
this_df <- df[df[[sample]]== thispatient, ]
temp_df <- table( this_df$region, this_df[[celltype]] )
temp_df <- temp_df / rowSums(temp_df)
temp_df <- data.frame( temp_df)
temp_df$patient <- thispatient
temp_df$group <- clinical[clinical[[sample]] == thispatient, group ]
df_plot <- rbind(df_plot, temp_df)
}
# for each region, calculate the average proportion of each cell type across all individuals
df_plot <- df_plot %>% dplyr::group_by( Var1 , Var2, group) %>%
summarise(mean_proportion = mean(Freq))
# we are only interested in the short term and long term survival individuals
df_plot <- df_plot [ df_plot$group %in% group_of_interest, ]
p <- ggplot(df_plot, aes(y = Var2, x = Var1 ,colour =mean_proportion , size = mean_proportion ))+
geom_point() +
facet_grid(~group, scales = "free", space = "free" ) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
xlab("Region" ) + ylab("Cell type") + scale_colour_viridis_c()
return(p)
}
draw_region_clustering_result <- function( data_sce, sample , selected_sample ){
# get meta data for a selected patient to visualise
metadata <- colData(data_sce)
metadata <- metadata[metadata[[sample]] == selected_sample , ]
metadata <- data.frame(metadata)
plotlist <- list() # define the list to store images for region highlighting
plotlist_celltype <- list() # define the list to store images for cell-type highlighting
# optional: define a colour palette
# you can also specify your own colour to use in the variable color_codes
tableau_palette <- scale_colour_tableau( palette = "Tableau 20")
color_codes <- tableau_palette$palette(18)
color_codes <- c(color_codes, "darkgrey") # for all other regions apart from region of interest, make the colour grey
names(color_codes) <- c(unique(metadata$description) , "other regions")
# look through each region to highlight the region of interest, as well as the cell type in the region of interest
for ( thisregion in sort(unique(metadata$region))){
# select the region of interest
selected_region_index <- metadata$region == thisregion
# for all other regions, define them as "other regions" so that they will be greyed out
metadata$selected_region <- "other regions"
metadata$selected_region[selected_region_index] <- "selected region"
# for all cell types outside the region of the interest, also make them greyed out
metadata$celltype <- metadata$description
metadata$celltype[!selected_region_index ] <- "other regions"
metadata$celltype <- factor(metadata$celltype, levels = c(unique(metadata$description), "other regions"))
# plot ggplot highlighting the region of interest
p <- ggplot(metadata, aes(x = Location_Center_X , y = Location_Center_Y , colour = selected_region )) + geom_point(alpha = 0.7) + ggtitle(thisregion) + scale_colour_manual(values = c("grey" , "red"))
# plot ggplot highlighting the cell-types in the region of interest
p2 <- ggplot(metadata, aes(x = Location_Center_X , y = Location_Center_Y , colour = celltype )) + geom_point(alpha = 0.7 ) + ggtitle(thisregion) + scale_colour_manual(values = color_codes)
plotlist [[thisregion ]] <- p
plotlist_celltype [[thisregion ]] <- p2
}
a <- ggarrange(plotlist = plotlist , ncol = 5, nrow = 1 , common.legend = T )
b <- ggarrange(plotlist = plotlist_celltype , ncol = 5, nrow = 1 , common.legend = T )
return (list(a,b))
}
draw_selected_region_boxplot <- function(data_sce, sample , celltype, group , group_of_interest, select_region){
df <- clinical[colData(data_sce)[, sample ], ]
df$region <- data_sce$region
df[[celltype]] <- data_sce[[celltype]]
df_plot <- NULL
for ( thispatient in unique(df[[sample]])){
this_df <- df[df[[sample]] == thispatient, ]
temp_df <- table( this_df$region, this_df[[celltype]] )
temp_df <- temp_df / rowSums(temp_df)
temp_df <- data.frame( temp_df)
temp_df$patient <- thispatient
temp_df$group <- unique( this_df[[group]])
df_plot <- rbind(df_plot, temp_df)
}
df_plot_region <- df_plot[df_plot$Var1 == select_region , ]
df_plot_region <- df_plot_region [ df_plot_region$group %in% group_of_interest, ]
p <- ggplot(df_plot_region, aes(x = Var2, y = Freq, colour = group)) +
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab("Proportion") + xlab("Cell type") + ggtitle("Selected region") + ylim(0, 0.25)
return(p)
}Overview
The recent development of spatial omics allows us to examine the spatial organisation of cells within tissues in their native environment. This workshop will discuss the challenges and analytical skills associated working with multi-sample spatial datasets and their use in disease risk prognosis. We will also introduce general analytical strategies and critical thinking questions that arise in the workflow.
Preparation and assumed knowledge
- Knowledge of R syntax
- Familiarity with the SingleCellExperiment class
- Basic knowledge in single cell data analysis. You can access our previous workshops for a quick review in single cell data analysis.
- Ability to install all required R packages, please check
sessionInfoat the end of this document to ensure you are using the correct version. - Familiarity with our previous workshop vignette on Introduction to Single Cell RNA-seq Analysis
Learning Objectives
- Describe and visualise spatial omics datasets
- Calculate different spatial statistics at the cell-type level using
scFeatures - Perform multi-view disease outcome prognosis with the package
ClassifyR - Understand and perform cell segmentation using
BIDCell - Develop an understanding of:
- evaluation of classification and survival models
- evaluate cohort heterogeneity given a survival model
- evaluation of classification and survival models
- Explore various strategies for disease outcome prognosis using
spatial omics data
Note: This data workshop offers participants the
opportunity to engage in hands-on analysis using R. However, if you are
not comfortable with coding in R, you can still acquire valuable
analytical skills by reviewing the output we provide in this file.
Time outline
Structure of the 3-hour workshop:
| Activity | Time |
|---|---|
| Part 1: Introduction | 20m |
| Part 2: Exploring spatial data | 20m |
| Part 3: Feature engineering with scFeatures | 30m |
| Break | 15m |
| Part 4: Survival analysis with ClassifyR | 15m |
| Part 5. Identify cohort heterogeneity | 15m |
| Part 6: Cell segmentation with BIDCell | 20m |
| Concluding remarks |
Part 2: Exploring spatial data
Data and background
A subset of the widely-known METABRIC breast cancer cohort has recently been analyzed using imaging mass cytometry: Imaging Mass Cytometry and Multiplatform Genomics Define the Phenogenomic Landscape of Breast Cancer. The question we are asking is: can their risk of recurrence be accurately estimated and therefore inform how aggressively they need to be treated? The other component of the analysis is patient clinical data, which has been sourced from Supplementary Table 5 of Dynamics of Breast-cancer Relapse Reveal Late-recurring ER-positive Genomic Subgroups.
## Quick look at data
assay(data_sce)[1:5, 1:5]## MB-0002:345:93 MB-0002:345:107 MB-0002:345:113 MB-0002:345:114
## HH3_total 4.82032390 3.8229411 2.55845170 4.82208870
## CK19 1.21185160 1.3223530 0.13832258 0.33366668
## CK8_18 2.80274720 2.4720588 0.60545164 2.43351100
## Twist 0.14651649 0.1176471 0.09877419 0.20791112
## CD68 0.07863736 0.1016471 0.03225806 0.05237778
## MB-0002:345:125
## HH3_total 2.41391110
## CK19 0.07055555
## CK8_18 0.20724444
## Twist 0.12004445
## CD68 0.00000000
Basic characteristics of the data objects:
- The dataset contains 38 proteins and 76307 cells.
- The outcome is recurrence-free survival.
Covariates
DT::datatable(data.frame(colData(data_sce)))